For the reader: You can click the black “code” box on the right hand side to either see or hide the R code behind analysis results and graphs.

The results are arranged to separate tabs by site/area, so that it’s easy to look at just the results of the site of interest, instead of scrolling accidentally down to a different site and getting confused.

The results for each site consist of four parts: pXRF, particle size distribution, loss on ignition, and colorimetry.

Sites

Ashdod-Yam

Intro about the site

pXRF

pXRF

From literature we can identify three rough classes of elements that could be used to analyze mudbricks from pXRF results:

Good: Rb, Sr, Y, Zr, Nb, Th, Pb, Cu, Zn, Fe
Neutral: V, Cr, Co, Ni, Ca, Ti
Bad: Na, P, Ba (+ Ca, Al)

The “bad” elements were excluded from the pXRF results from the get-go. After scaling and averaging the remaining values we ended up with two analysis datasets: “pxrf_all” that contains all the elements that have non-zero results, and “pxrf_no_na”, in which all the elements that have even one NA result are omitted.

We end up with 20 elements in the “pxrf_all” dataset, and 12 elements in the “pxrf_no_na” dataset.

# Libraries
library(dplyr); 
library(GGally); 
library(corrplot); 
library(tidyr); 
library(openxlsx); 
library(cluster); 
library(ggplot2); 
library(ggrepel);
library(ggfortify); 
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots

# Read the raw data
pxrf <- read.xlsx("data/pxrf_AY.xlsx", sep=";")

# Remove any rows containing words 'ERROR' or 'TEST'
pxrf <- filter(pxrf, !grepl('TEST', Sample))
pxrf <- filter(pxrf, !grepl('ERROR', Sample))

# Turn any "LOD":s and missing values to "NA":s to allow scaling
values <- c("MgO", "Al2O3", "SiO2", 
            "S", "Cl", "K2O", "CaO", "Ti", "V", "Cr", 
            "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "As", 
            "Se", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", 
            "Rh", "Pd", "Ag", "Cd", "Sn", "Sb", 
            "La", "Ce", "Hf", "Ta", "W", "Pt", "Au", 
            "Hg", "Tl", "Pb", "Bi", "Th", "U")

pxrf_NA <- select(pxrf, one_of(values))
pxrf_NA <- pxrf_NA %>% mutate_if(is.character,as.numeric)

# Scaling the data with standard z-score method
scaled_pxrf <- as.data.frame(scale(pxrf_NA))

# Average data by "Sample", "Area" and "Type" columns from the original dataset
average_pxrf <- aggregate(scaled_pxrf, by = list(pxrf$Sample, pxrf$Area, pxrf$Type), FUN = mean)

# Assign sample names as row names, and rename the other newly created columns back to "Type" and "Area" for clarity
rownames(average_pxrf) <- average_pxrf$Group.1
average_pxrf <- average_pxrf %>% select(-Group.1)
names(average_pxrf)[names(average_pxrf) == "Group.2"] <- "Area"
names(average_pxrf)[names(average_pxrf) == "Group.3"] <- "Type"

# Change "Type" and "Area" to factors
average_pxrf$Type <- factor(average_pxrf$Type)
average_pxrf$Area <- factor(average_pxrf$Area)

# Removing columns that only have NA values
all_na <- function(x) any(!is.na(x))
pxrf_all <- average_pxrf %>% select_if(all_na)

# Removing columns if they contain any NA values at all
any_na <- function(x) all(!is.na(x))
pxrf_no_na <- average_pxrf %>% select_if(any_na)

pxrf_no_na:

glimpse(pxrf_no_na)
## Rows: 47
## Columns: 14
## $ Area  <fct> Area D Wall, Area D Wall, Area D Wall, Area D Wall, Area D Wall,~
## $ Type  <fct> mud mortar, mud plaster, mud plaster, mud plaster, mud plaster, ~
## $ Al2O3 <dbl> -0.83430964, 0.28804627, 0.92438017, -0.75653070, 0.83252855, 0.~
## $ SiO2  <dbl> -1.07372922, 0.46591062, 0.91277017, -0.41768748, 1.37707345, 0.~
## $ K2O   <dbl> -0.9354224, -0.1969237, 0.4729661, -0.8000151, 0.3529005, 0.9275~
## $ CaO   <dbl> -0.31855193, -0.76710354, 1.55087898, -0.52286381, -0.30804896, ~
## $ Ti    <dbl> -0.87635022, -0.13169866, 0.36204744, -0.56350248, 0.12413530, 0~
## $ Mn    <dbl> -0.47857024, -0.36084140, 0.40974742, -0.26251105, 0.07529047, 0~
## $ Fe    <dbl> -0.65915484, -0.30494130, 0.64604506, -0.48601854, 0.28383043, 1~
## $ Cu    <dbl> -0.0382265, -0.9436700, -0.2645874, -0.3777678, -1.0568504, 0.30~
## $ Zn    <dbl> -0.01720942, -0.65101964, -0.39749555, 0.10955263, -0.39749555, ~
## $ Rb    <dbl> 0.36277317, -0.54198093, -0.29522982, 0.67121207, -0.54198093, 0~
## $ Sr    <dbl> -0.11489739, -0.84951611, 0.60409114, -0.36107281, -0.41187092, ~
## $ Zr    <dbl> -1.764549e-01, -7.787123e-01, 6.876654e-05, 1.298656e-01, -3.322~

pxrf_all:

glimpse(pxrf_all)
## Rows: 47
## Columns: 22
## $ Area  <fct> Area D Wall, Area D Wall, Area D Wall, Area D Wall, Area D Wall,~
## $ Type  <fct> mud mortar, mud plaster, mud plaster, mud plaster, mud plaster, ~
## $ MgO   <dbl> -0.51417215, 0.64100358, 0.22606806, -0.12125145, 0.61130727, -0~
## $ Al2O3 <dbl> -0.83430964, 0.28804627, 0.92438017, -0.75653070, 0.83252855, 0.~
## $ SiO2  <dbl> -1.07372922, 0.46591062, 0.91277017, -0.41768748, 1.37707345, 0.~
## $ S     <dbl> -0.2323566, -0.2339645, NA, NA, -0.1447233, NA, NA, NA, NA, NA, ~
## $ Cl    <dbl> -0.45906368, 0.66024144, 0.47946871, -0.29005356, 0.12535227, -0~
## $ K2O   <dbl> -0.9354224, -0.1969237, 0.4729661, -0.8000151, 0.3529005, 0.9275~
## $ CaO   <dbl> -0.31855193, -0.76710354, 1.55087898, -0.52286381, -0.30804896, ~
## $ Ti    <dbl> -0.87635022, -0.13169866, 0.36204744, -0.56350248, 0.12413530, 0~
## $ V     <dbl> NA, NA, -0.009516182, NA, NA, 0.216968943, NA, NA, NA, 0.6861167~
## $ Mn    <dbl> -0.47857024, -0.36084140, 0.40974742, -0.26251105, 0.07529047, 0~
## $ Fe    <dbl> -0.65915484, -0.30494130, 0.64604506, -0.48601854, 0.28383043, 1~
## $ Ni    <dbl> 1.17211765, -0.73894374, -0.73894374, 0.14833477, -0.92094959, 0~
## $ Cu    <dbl> -0.0382265, -0.9436700, -0.2645874, -0.3777678, -1.0568504, 0.30~
## $ Zn    <dbl> -0.01720942, -0.65101964, -0.39749555, 0.10955263, -0.39749555, ~
## $ As    <dbl> NA, NA, -0.0963078, NA, -0.8953058, -0.2960573, NA, 2.3006863, N~
## $ Rb    <dbl> 0.36277317, -0.54198093, -0.29522982, 0.67121207, -0.54198093, 0~
## $ Sr    <dbl> -0.11489739, -0.84951611, 0.60409114, -0.36107281, -0.41187092, ~
## $ Y     <dbl> NA, -0.85601627, -0.05383750, 0.28040365, -0.41036140, 0.3026864~
## $ Zr    <dbl> -1.764549e-01, -7.787123e-01, 6.876654e-05, 1.298656e-01, -3.322~
## $ Nb    <dbl> NA, NA, -0.4829764, NA, -0.8931231, NA, -0.4829764, NA, 0.952536~

pXRF: correlations

Introduction

# Drop the more dubious elements
pxrf_1 <- pxrf_no_na %>% select(-Al2O3)
pxrf_1 <- pxrf_1 %>% select(-CaO)

# Visualize correlations between elements
cor(pxrf_1[3:12]) %>% corrplot (type = "lower", tl.cex = 1, tl.pos="d", cl.pos="r")

# Optimal number of clusters in K-means analysis: 2
k_max <- 8
twcss <- sapply(1:k_max, function(k){kmeans(pxrf_1[3:12], k)$tot.withinss})
qplot(x = 1:k_max, y = twcss, geom = 'line')

# K-means clustering in ggpairs: 2 clusters
km <-kmeans(pxrf_1[3:12], centers = 2)
cluster_x <- as.factor(km$cluster)
ggpairs(pxrf_1, mapping = aes(col = cluster_x))

pXRF: PCA

PCA with only the no-NA:s elements

# Elements with no NA values at all
colnames(pxrf_no_na)
##  [1] "Area"  "Type"  "Al2O3" "SiO2"  "K2O"   "CaO"   "Ti"    "Mn"    "Fe"   
## [10] "Cu"    "Zn"    "Rb"    "Sr"    "Zr"
# PCA
pca_1 <- prcomp(pxrf_1[3:12])
summary(pca_1)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     1.9037 1.0026 0.8270 0.67213 0.37446 0.35729 0.29419
## Proportion of Variance 0.5809 0.1611 0.1096 0.07242 0.02248 0.02046 0.01387
## Cumulative Proportion  0.5809 0.7421 0.8517 0.92413 0.94661 0.96707 0.98095
##                            PC8     PC9    PC10
## Standard deviation     0.26346 0.16502 0.14906
## Proportion of Variance 0.01113 0.00437 0.00356
## Cumulative Proportion  0.99207 0.99644 1.00000
# PCA1 plots
biplot(pca_1, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA Ashdod-Yam elements")

autoplot(pca_1, data=pxrf_1, colour='Area', shape = FALSE, label = TRUE,  main = "PCA Ashdod-Yam grouped by area")

autoplot(pca_1, data=pxrf_1, colour='Type', shape = FALSE, label = TRUE,  main = "PCA Ashdod-Yam grouped by sample type")

PCA with all the feasible elements

# Elements that have at least one viable value
colnames(pxrf_all)
##  [1] "Area"  "Type"  "MgO"   "Al2O3" "SiO2"  "S"     "Cl"    "K2O"   "CaO"  
## [10] "Ti"    "V"     "Mn"    "Fe"    "Ni"    "Cu"    "Zn"    "As"    "Rb"   
## [19] "Sr"    "Y"     "Zr"    "Nb"
# PCA with NA:s converted to zeros
pxrf_all[is.na(pxrf_all)] <- 0

pca_2 <- prcomp(pxrf_all[3:22])
summary(pca_2)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.1392 1.2623 1.1731 0.77694 0.75938 0.56778 0.50254
## Proportion of Variance 0.4455 0.1551 0.1340 0.05877 0.05614 0.03138 0.02459
## Cumulative Proportion  0.4455 0.6006 0.7346 0.79337 0.84951 0.88090 0.90548
##                            PC8     PC9   PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.44075 0.42941 0.3738 0.33174 0.29859 0.26048 0.24174
## Proportion of Variance 0.01891 0.01795 0.0136 0.01071 0.00868 0.00661 0.00569
## Cumulative Proportion  0.92439 0.94235 0.9559 0.96666 0.97534 0.98195 0.98764
##                           PC15    PC16    PC17    PC18    PC19    PC20
## Standard deviation     0.22407 0.16224 0.16055 0.12292 0.08709 0.04448
## Proportion of Variance 0.00489 0.00256 0.00251 0.00147 0.00074 0.00019
## Cumulative Proportion  0.99253 0.99509 0.99760 0.99907 0.99981 1.00000
# PCA2 plots
biplot(pca_2, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA Ashdod-Yam more elements")

autoplot(pca_1, data=pxrf_1, colour='Area', shape = FALSE, label = TRUE,  main = "PCA Ashdod-Yam more elements grouped by area")

autoplot(pca_1, data=pxrf_1, colour='Type', shape = FALSE, label = TRUE,  main = "PCA Ashdod-Yam more elements grouped by sample type")

pXRF: HCA

Introduction

# HCA

# compute divisive hierarchical clustering
hc4 <- diana(pxrf_1[3:12])

# Divise coefficient; amount of clustering structure found
hc4$dc
## [1] 0.8007861
# Diana dendrogram
pltree(hc4, cex = 1, hang = -1, main = "Dendrogram of diana")

# HCA dendrogam 1
dend <- 
    pxrf_1[3:12] %>%                    # data
    dist %>%                            # calculate a distance matrix
    hclust(method = "ward.D2") %>%      # hierarchical clustering 
    as.dendrogram %>%                   # turn the object into a dendrogram
    set("branches_k_color", k=5) %>% 
    set("branches_lwd", 1.2) %>%
    set("labels_col", c("red", "black", "blue"), k=5) %>%    
    set("labels_cex", 1) %>% 
    set("leaves_pch", NA) 
# plot the dend in usual "base" plotting engine:
plot(dend)

# HCA dendrogam 2: 
HCA.ward5 <- hclust(dist(pxrf_1[3:12], method="euclid"), method="ward.D2")
plot(HCA.ward5, main="Ward's method")
rect.hclust(HCA.ward5, k=5)

Grain size analysis

Particle size

# Libraries
library(dplyr); 
library(GGally); 
library(corrplot); 
library(tidyr); 
library(openxlsx); 
library(cluster); 
library(ggplot2); 
library(ggrepel);
library(ggfortify); 
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots


## Particle size
# Read and filter data
particle <- read.xlsx("data/grain_AY.xlsx", sep=";")

# Remove automatically created averages in order to include results from multiple runs for the same sample
particle <- particle %>% 
  filter(!grepl('Average', Sample)) %>% 
  filter(!grepl('test', Sample))

# Average by sample name
particle <- aggregate(particle, by=list(particle$Sample), FUN=mean)
particle <- particle %>% select(-Sample)
names(particle)[names(particle) == "Group.1"] <- "Sample"

# Saving the clean data as "AY_grain2"
## write.xlsx(particle, file="data/grain/AY_grain2.xlsx")

# Manually added Type and Context columns to AY_grain2 data: "AY_grain3"
grain3 <- read.xlsx("data/grain/AY_grain3.xlsx", sep=";")

# Samples as rownames
rownames(grain3) <- grain3$Sample 

# Scaling clay-silt-sand portions to values between 0 and 1
## Otherwise the differences in low clay percentages are lost
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}

grain <- as.data.frame(apply(grain3[3:5], 2, normalize))

# Ternary plots

ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
  labs(title="Clay silt sand") +
  theme_rgbw() +
  theme_nomask() +
  geom_point(size=0) +
  geom_text(aes(label=rownames(grain)), size=3)

ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
  labs(title="Clay silt sand") +
  theme_rgbw() +
  theme_nomask() +
  geom_point(size=2)

ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Type)) +
  labs(title="Clay silt sand") +
  theme_nomask() +
  geom_mask() +
  geom_point(size=2) 

Loss on ignition

We did loss on ignition analysis both in the traditional manual method and with a modern TGA for comparison.

# Libraries
library(dplyr); 
library(GGally); 
library(corrplot); 
library(tidyr); 
library(openxlsx); 
library(cluster); 
library(ggplot2); 
library(ggrepel);
library(ggfortify); 
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots

Preparing data

loi <- read.xlsx("data/loi_AY.xlsx", sep=";")
tga <- read.xlsx("data/tga_AY.xlsx", sep=";")

summary(loi)
##     sample             crucible       wet_weight      sample_wet   
##  Length:52          Min.   :8.079   Min.   :10.13   Min.   :1.538  
##  Class :character   1st Qu.:8.566   1st Qu.:11.00   1st Qu.:2.464  
##  Mode  :character   Median :9.006   Median :11.71   Median :2.635  
##                     Mean   :8.994   Mean   :11.55   Mean   :2.559  
##                     3rd Qu.:9.435   3rd Qu.:12.05   3rd Qu.:2.824  
##                     Max.   :9.925   Max.   :12.68   Max.   :3.043  
##    dry_weight     after_550_C     after_950_C      context         
##  Min.   :10.11   Min.   :10.08   Min.   :10.05   Length:52         
##  1st Qu.:10.97   1st Qu.:10.94   1st Qu.:10.86   Class :character  
##  Median :11.67   Median :11.62   Median :11.53   Mode  :character  
##  Mean   :11.52   Mean   :11.47   Mean   :11.41                     
##  3rd Qu.:12.00   3rd Qu.:11.96   3rd Qu.:11.93                     
##  Max.   :12.65   Max.   :12.60   Max.   :12.52                     
##      type          
##  Length:52         
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
summary(tga)
##      Name           Analysis;Date         Batch               Wet           
##  Length:53          Length:53          Length:53          Length:53         
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##      Dry              Mass_550           Mass_950        
##  Length:53          Length:53          Length:53         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character
# Remove rows containing words 'kekkila' (internal standard)
tga <- filter(tga, !grepl('kekkila', Name))

# TGA character columns to numeric
tga[, c(4:7)] <- sapply(tga[, c(4:7)], as.numeric)

# Sample names as rownames
rownames(loi) <- loi$sample
rownames(tga) <- tga$Name

# Creating new columns for analysis by subtracting crucible mass;
# "dry" (sample mass after drying)
# "c550" (sample mass after 550 C)
# "c950" (sample mass after 950 C)
loi$dry <- (loi$dry_weight - loi$crucible)
loi$c550mass <- (loi$after_550_C - loi$crucible)
loi$c950mass <- (loi$after_950_C - loi$crucible)

loi$c550 <- (loi$dry - loi$c550mass)/(loi$dry) * 100
loi$c950 <- (loi$c550mass - loi$c950mass)/(loi$c550mass) * 100

# Same with TGA
tga$c550 <- (tga$Dry - tga$Mass_550)/(tga$Dry) * 100
tga$c950 <- (tga$Mass_550 - tga$Mass_950)/(tga$Mass_550) * 100

Manual LOI results

# Total distribution of LOI in 550 C (1) and 950 C (2)
boxplot(loi$c550, loi$c950)

# boxplot.stats gives us the lower whisker or minimum, the interquartile range of the middle (the second and fourth values), the median (third value) and the upper whisker or maximum. 
boxplot.stats(loi$c950)
## $stats
## [1] 0.398150 1.528018 2.198513 3.162314 4.889189
## 
## $n
## [1] 52
## 
## $conf
## [1] 1.840428 2.556598
## 
## $out
## [1]  8.041755 11.960444
# The two high carbonate outliers are both AY-53 (two different measurements to make sure the first batch wasn't misleading)

# Removing duplicates from the next graph
rownames(loi)
##  [1] "AY-1"    "AY-2"    "AY-3"    "AY-4"    "AY-5"    "AY-6"    "AY-7"   
##  [8] "AY-8"    "AY-9"    "AY-10"   "AY-11"   "AY-12"   "AY-13"   "AY-14"  
## [15] "AY-15"   "AY-16"   "AY-17"   "AY-18"   "AY-19"   "AY-20"   "AY-21"  
## [22] "AY-22"   "AY-23"   "AY-24"   "AY-25"   "AY-26"   "AY-27"   "AY-28"  
## [29] "AY-29"   "AY-30"   "AY-31"   "AY-32"   "AY-33"   "AY-34"   "AY-35"  
## [36] "AY-36"   "AY-37"   "AY-38"   "AY-39"   "AY-40"   "AY-41"   "AY-42"  
## [43] "AY-50"   "AY-51"   "AY-52"   "AY-53"   "AY-54"   "AY-50_2" "AY-51_2"
## [50] "AY-52_2" "AY-53_2" "AY-54_2"
loi <- subset(loi[1:47, ])

ggplot(loi, 
      aes(c550, c950, label = rownames(loi), colour = factor(context))) +
      geom_point(size=2, aes(shape = factor(type))) +
      geom_text_repel(size=3) + labs(title = "Organic vs. carbonate content") +
      xlab('Organic LOI (%)') +
      ylab('Carbonate LOI (%)') +
      theme(axis.title = element_text())

ggplot(loi, 
      aes(c550, c950)) +
      geom_point(size=2, aes(shape = factor(type), colour = factor(context))) +
      labs(title = "Organic vs. carbonate content") +
      xlab('Organic LOI (%)') +
      ylab('Carbonate LOI (%)') +
      theme(axis.title = element_text())

barplot(loi$c550)

barplot(loi$c950)

# Stacked barplot (x = reorder(sample, LOI))
loi_long = gather(loi, key = var, value = value, c550, c950)

ggplot(loi_long, aes(x = reorder(sample, value), y = value, fill = var)) +
  geom_bar(stat = 'identity', position = 'stack') + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

TGA LOI results

# Total distribution of LOI in 550 C (1) and 950 C (2)
boxplot(tga$c550, tga$c950)

# boxplot.stats gives us the lower whisker or minimum, the interquartile range of the middle (the second and fourth values), the median (third value) and the upper whisker or maximum. 
boxplot.stats(tga$c950)
## $stats
## [1] 0.7233976 1.5855926 2.3958197 3.4621816 5.4590326
## 
## $n
## [1] 50
## 
## $conf
## [1] 1.976504 2.815135
## 
## $out
## [1] 7.994718
# Removing duplicates from the next graph
#rownames(tga)
#loi <- subset(loi[1:47, ])

ggplot(tga, 
      aes(c550, c950, label = rownames(tga))) +
      geom_point(size=2) +
      geom_text_repel(size=3) + labs(title = "Organic vs. carbonate content") +
      xlab('Organic LOI (%)') +
      ylab('Carbonate LOI (%)') +
      theme(axis.title = element_text())

barplot(tga$c550)

barplot(tga$c950)

# Stacked barplot (x = reorder(sample, LOI))
tga_long = gather(tga, key = var, value = value, c550, c950)

ggplot(tga_long, aes(x = reorder(Name, value), y = value, fill = var)) +
  geom_bar(stat = 'identity', position = 'stack') + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Colorimetry

# Libraries
library(dplyr); 
library(ggplot2); 
library(ggrepel);
library(rgl);
library(plot3D);
library(colordistance)

color1 <- read.table("data/color_AY.txt", header = TRUE, sep = "\t", dec = ",")
rownames(color1) <- color1$Data.Name
color <- subset(color1[, 3:5])
summary(color)
##        L               a               b        
##  Min.   :37.56   Min.   : 7.97   Min.   :16.81  
##  1st Qu.:42.13   1st Qu.: 9.72   1st Qu.:19.76  
##  Median :43.64   Median :10.38   Median :21.18  
##  Mean   :43.92   Mean   :10.49   Mean   :21.22  
##  3rd Qu.:45.91   3rd Qu.:11.29   3rd Qu.:22.34  
##  Max.   :48.24   Max.   :13.53   Max.   :25.03
ggplot(color, 
      aes(a, b, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) + 
      labs(title = "Color (a*b)") +
      xlab("a D65") +
      ylab("b D65") 

ggplot(color1,
      aes(X, L, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) +     
      labs(title = "Luminance") +
      ylab("L* (D65)") +
      xlab("Sample number") 

ggplot(color,
      aes(a, L, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) +     
      labs(title = "A versus luminance") +
      ylab("L* (D65)") +
      xlab("a") 

ggplot(color,
      aes(b, L, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) +     
      labs(title = "B versus luminance") +
      ylab("L* (D65)") +
      xlab("b") 

# Colorful 3D plot with drop lines
scatter3D(color$a, color$b, color$L, 
          xlab = "a* (D65)", ylab = "b* (D65)", zlab="L* (D65)",
          phi = 0, bty = "g",  type = "h", 
          ticktype = "detailed", pch = 19, cex = 1.2)
          text3D(color$a, color$b, color$L,  labels = rownames(color),
          add = TRUE, colkey = FALSE, cex = 0.7)

scatter3D(color$L, color$a, color$b, 
          xlab = "L* (D65)", ylab = "a* (D65)", zlab="b* (D65)",
          phi = 0, bty = "g",  type = "h", 
          ticktype = "detailed", pch = 19, cex = 1.2)
          text3D(color$L, color$a, color$b,  labels = rownames(color),
          add = TRUE, colkey = FALSE, cex = 0.7)

Ashdod-Yam: Byzantine

Intro about the site

pXRF

# Libraries
library(dplyr); 
library(GGally); 
library(corrplot); 
library(tidyr); 
library(openxlsx); 
library(cluster); 
library(ggplot2); 
library(ggrepel);
library(ggfortify); 
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots

Grain size analysis

# Libraries
library(dplyr); 
library(GGally); 
library(corrplot); 
library(tidyr); 
library(openxlsx); 
library(cluster); 
library(ggplot2); 
library(ggrepel);
library(ggfortify); 
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots

Data preparation

## Particle size
# Read and filter data
particle <- read.xlsx("data/grain_AYB.xlsx", sep=";")

# Remove automatically created averages in order to include results from multiple runs for the same sample
particle <- particle %>% 
  filter(!grepl('Average', Sample)) %>% 
  filter(!grepl('test', Sample))

# Average by sample name
particle <- aggregate(particle, by=list(particle$Sample), FUN=mean)
particle <- particle %>% select(-Sample)
names(particle)[names(particle) == "Group.1"] <- "Sample"

# Saving the clean data as "AY_grain2"
## write.xlsx(particle, file="data/granulometry/AY_grain2.xlsx")

# Manually added Type and Context columns to AY_grain2 data: "AY_grain3"
grain3 <- read.xlsx("data/granulometry/AY_grain3.xlsx", sep=";")

# Samples as rownames
rownames(grain3) <- grain3$Sample 

# Scaling clay-silt-sand portions to values between 0 and 1
## Otherwise the differences in low clay percentages are lost
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}

grain <- as.data.frame(apply(grain3[3:5], 2, normalize))

# Ternary plots

ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
  labs(title="Clay silt sand") +
  theme_rgbw() +
  theme_nomask() +
  geom_point(size=0) +
  geom_text(aes(label=rownames(grain)), size=3)

ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Type)) +
  labs(title="Clay silt sand") +
  theme_nomask() +
  geom_mask() +
  geom_point(size=2) 

Loss on ignition

We did loss on ignition analysis both in the traditional manual method and with a modern TGA for comparison.

# Libraries
library(dplyr); 
library(GGally); 
library(corrplot); 
library(tidyr); 
library(openxlsx); 
library(cluster); 
library(ggplot2); 
library(ggrepel);
library(ggfortify); 
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots

Preparing data

loi <- read.xlsx("data/loi_AYB.xlsx", sep=";")
tga <- read.xlsx("data/tga_AYB.xlsx", sep=";")

summary(loi)
##     sample             crucible       wet_weight      sample_wet   
##  Length:7           Min.   :8.382   Min.   :10.92   Min.   :2.257  
##  Class :character   1st Qu.:8.555   1st Qu.:11.39   1st Qu.:2.613  
##  Mode  :character   Median :8.666   Median :11.50   Median :2.880  
##                     Mean   :8.943   Mean   :11.68   Mean   :2.739  
##                     3rd Qu.:9.442   3rd Qu.:12.13   3rd Qu.:2.905  
##                     Max.   :9.560   Max.   :12.31   Max.   :3.002  
##    dry_weight     after_550_C     after_950_C   
##  Min.   :10.83   Min.   :10.62   Min.   :10.50  
##  1st Qu.:11.33   1st Qu.:11.25   1st Qu.:11.04  
##  Median :11.45   Median :11.39   Median :11.21  
##  Mean   :11.63   Mean   :11.55   Mean   :11.24  
##  3rd Qu.:12.11   3rd Qu.:12.05   3rd Qu.:11.43  
##  Max.   :12.26   Max.   :12.24   Max.   :12.06
summary(tga)
##      Name           Analysis;Date      Batch               Wet           
##  Length:7           Min.   :44536   Length:7           Length:7          
##  Class :character   1st Qu.:44536   Class :character   Class :character  
##  Mode  :character   Median :44536   Mode  :character   Mode  :character  
##                     Mean   :44536                                        
##                     3rd Qu.:44536                                        
##                     Max.   :44536                                        
##      Dry              Mass_550           Mass_950        
##  Length:7           Length:7           Length:7          
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
## 
# Remove rows containing words 'kekkila' (internal standard)
tga <- filter(tga, !grepl('kekkila', Name))

# TGA character columns to numeric
tga[, c(4:7)] <- sapply(tga[, c(4:7)], as.numeric)

# Sample names as rownames
rownames(loi) <- loi$sample
rownames(tga) <- tga$Name

# Creating new columns for analysis by subtracting crucible mass;
# "dry" (sample mass after drying)
# "c550" (sample mass after 550 C)
# "c950" (sample mass after 950 C)
loi$dry <- (loi$dry_weight - loi$crucible)
loi$c550mass <- (loi$after_550_C - loi$crucible)
loi$c950mass <- (loi$after_950_C - loi$crucible)

loi$c550 <- (loi$dry - loi$c550mass)/(loi$dry) * 100
loi$c950 <- (loi$c550mass - loi$c950mass)/(loi$c550mass) * 100

# Same with TGA
tga$c550 <- (tga$Dry - tga$Mass_550)/(tga$Dry) * 100
tga$c950 <- (tga$Mass_550 - tga$Mass_950)/(tga$Mass_550) * 100

Manual LOI results

# Total distribution of LOI in 550 C (1) and 950 C (2)
boxplot(loi$c550, loi$c950)

# boxplot.stats gives us the lower whisker or minimum, the interquartile range of the middle (the second and fourth values), the median (third value) and the upper whisker or maximum. 
boxplot.stats(loi$c950)
## $stats
## [1]  1.087679  5.239954  6.463715 14.575647 15.544905
## 
## $n
## [1] 7
## 
## $conf
## [1]  0.8885902 12.0388404
## 
## $out
## [1] 32.40429
# The two high carbonate outliers are both AY-53 (two different measurements to make sure the first batch wasn't misleading)

# Removing duplicates from the next graph
rownames(loi)
## [1] "AY-43" "AY-44" "AY-45" "AY-46" "AY-47" "AY-48" "AY-49"
loi <- subset(loi[1:47, ])

### ggplots later

barplot(loi$c550)

barplot(loi$c950)

# Stacked barplot (x = reorder(sample, LOI))
loi_long = gather(loi, key = var, value = value, c550, c950)

ggplot(loi_long, aes(x = reorder(sample, value), y = value, fill = var)) +
  geom_bar(stat = 'identity', position = 'stack') + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

TGA LOI results

# Total distribution of LOI in 550 C (1) and 950 C (2)
boxplot(tga$c550, tga$c950)

# boxplot.stats gives us the lower whisker or minimum, the interquartile range of the middle (the second and fourth values), the median (third value) and the upper whisker or maximum. 
boxplot.stats(tga$c950)
## $stats
## [1]  6.813924  8.278065  9.781651 11.887838 12.043832
## 
## $n
## [1] 7
## 
## $conf
## [1]  7.625953 11.937349
## 
## $out
## [1] 29.72412
# Removing duplicates from the next graph
#rownames(tga)
#loi <- subset(loi[1:47, ])

ggplot(tga, 
      aes(c550, c950, label = rownames(tga))) +
      geom_point(size=2) +
      geom_text_repel(size=3) + labs(title = "Organic vs. carbonate content") +
      xlab('Organic LOI (%)') +
      ylab('Carbonate LOI (%)') +
      theme(axis.title = element_text())

barplot(tga$c550)

barplot(tga$c950)

# Stacked barplot (x = reorder(sample, LOI))
tga_long = gather(tga, key = var, value = value, c550, c950)

ggplot(tga_long, aes(x = reorder(Name, value), y = value, fill = var)) +
  geom_bar(stat = 'identity', position = 'stack') + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Colorimetry

# Libraries
library(dplyr); 
library(ggplot2); 
library(ggrepel);
library(rgl);
library(plot3D);
library(colordistance)

color1 <- read.table("data/color_AYB.txt", header = TRUE, sep = "\t", dec = ",")
rownames(color1) <- color1$Data.Name
color <- subset(color1[, 3:5])
summary(color)
##        L               a                b        
##  Min.   :34.56   Min.   : 2.480   Min.   : 6.98  
##  1st Qu.:38.93   1st Qu.: 5.215   1st Qu.:12.73  
##  Median :45.30   Median : 5.540   Median :13.28  
##  Mean   :46.69   Mean   : 6.211   Mean   :14.56  
##  3rd Qu.:50.95   3rd Qu.: 6.675   3rd Qu.:16.21  
##  Max.   :67.20   Max.   :11.680   Max.   :23.75
ggplot(color, 
      aes(a, b, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) + 
      labs(title = "Color (a*b)") +
      xlab("a D65") +
      ylab("b D65") 

ggplot(color1,
      aes(X, L, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) +     
      labs(title = "Luminance") +
      ylab("L* (D65)") +
      xlab("Sample number") 

ggplot(color,
      aes(a, L, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) +     
      labs(title = "A versus luminance") +
      ylab("L* (D65)") +
      xlab("a") 

ggplot(color,
      aes(b, L, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) +     
      labs(title = "B versus luminance") +
      ylab("L* (D65)") +
      xlab("b") 

# Colorful 3D plot with drop lines
scatter3D(color$a, color$b, color$L, 
          xlab = "a* (D65)", ylab = "b* (D65)", zlab="L* (D65)",
          phi = 0, bty = "g",  type = "h", 
          ticktype = "detailed", pch = 19, cex = 1.2)
          text3D(color$a, color$b, color$L,  labels = rownames(color),
          add = TRUE, colkey = FALSE, cex = 0.7)

scatter3D(color$L, color$a, color$b, 
          xlab = "L* (D65)", ylab = "a* (D65)", zlab="b* (D65)",
          phi = 0, bty = "g",  type = "h", 
          ticktype = "detailed", pch = 19, cex = 1.2)
          text3D(color$L, color$a, color$b,  labels = rownames(color),
          add = TRUE, colkey = FALSE, cex = 0.7)

Israel

Intro about the site

pXRF

pXRF

Introduction

# Libraries
library(dplyr); 
library(GGally); 
library(corrplot); 
library(tidyr); 
library(openxlsx); 
library(cluster); 
library(ggplot2); 
library(ggrepel);
library(ggfortify); 
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots

# Read data
nn <- read.xlsx("data/pxrf_israel.xlsx", sep=";")
nn <- as.data.frame(nn)

Grain size analysis

# Libraries
library(dplyr); 
library(GGally); 
library(corrplot); 
library(tidyr); 
library(openxlsx); 
library(cluster); 
library(ggplot2); 
library(ggrepel);
library(ggfortify); 
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots

Data preparation

# Read and filter data
particle <- read.xlsx("data/grain_israel.xlsx", sep=";")

# Remove automatically created averages in order to include results from multiple runs for the same sample
particle <- particle %>% 
  filter(!grepl('Average', Sample)) %>% 
  filter(!grepl('test', Sample))

# Average by sample name
particle <- aggregate(particle, by=list(particle$Sample), FUN=mean)
particle <- particle %>% select(-Sample)
names(particle)[names(particle) == "Group.1"] <- "Sample"

# Saving the clean data as "israel_grain2"
## write.xlsx(particle, file="data/granulometry/israel_grain2.xlsx")

# Manually added Type and Context columns to israel_grain2 data: "israel_grain3"
grain3 <- read.xlsx("data/granulometry/israel_grain3.xlsx", sep=";")

# Samples as rownames
rownames(grain3) <- grain3$Sample 

# Scaling clay-silt-sand portions to values between 0 and 1
## Otherwise the differences in low clay percentages are lost
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}

grain <- as.data.frame(apply(grain3[3:5], 2, normalize))

# Ternary plots

ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
  labs(title="Clay silt sand") +
  theme_rgbw() +
  theme_nomask() +
  geom_point(size=0) +
  geom_text(aes(label=rownames(grain)), size=3)

ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
  labs(title="Clay silt sand") +
  theme_nomask() +
  geom_mask() +
  geom_point(size=2)

ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Type)) +
  labs(title="Clay silt sand") +
  theme_nomask() +
  geom_mask() +
  geom_point(size=2) 

Loss on ignition

We did loss on ignition analysis both in the traditional manual method and with a modern TGA for comparison.

# Libraries
library(dplyr); 
library(GGally); 
library(corrplot); 
library(tidyr); 
library(openxlsx); 
library(cluster); 
library(ggplot2); 
library(ggrepel);
library(ggfortify); 
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots

Preparing data

loi <- read.xlsx("data/loi_israel.xlsx", sep=";")
tga <- read.xlsx("data/tga_israel.xlsx", sep=";")

summary(loi)
##     sample             crucible       wet_weight      sample_wet   
##  Length:78          Min.   :7.921   Min.   :10.13   Min.   :1.538  
##  Class :character   1st Qu.:8.566   1st Qu.:11.18   1st Qu.:2.448  
##  Mode  :character   Median :9.006   Median :11.71   Median :2.635  
##                     Mean   :8.995   Mean   :11.58   Mean   :2.588  
##                     3rd Qu.:9.449   3rd Qu.:12.05   3rd Qu.:2.845  
##                     Max.   :9.925   Max.   :12.68   Max.   :3.231  
##    dry_weight     after_550_C     after_950_C    
##  Min.   :10.11   Min.   :10.08   Min.   : 9.773  
##  1st Qu.:11.17   1st Qu.:11.12   1st Qu.:10.820  
##  Median :11.67   Median :11.62   Median :11.302  
##  Mean   :11.55   Mean   :11.49   Mean   :11.267  
##  3rd Qu.:12.00   3rd Qu.:11.95   3rd Qu.:11.781  
##  Max.   :12.65   Max.   :12.60   Max.   :12.524
summary(tga)
##      Name           Analysis;Date         Batch               Wet           
##  Length:72          Length:72          Length:72          Length:72         
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##      Dry              Mass_550           Mass_950        
##  Length:72          Length:72          Length:72         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character
# Remove rows containing words 'kekkila' (internal standard)
tga <- filter(tga, !grepl('kekkila', Name))

# TGA character columns to numeric
tga[, c(4:7)] <- sapply(tga[, c(4:7)], as.numeric)

# Sample names as rownames
rownames(loi) <- loi$sample
rownames(tga) <- tga$Name

# Creating new columns for analysis by subtracting crucible mass;
# "dry" (sample mass after drying)
# "c550" (sample mass after 550 C)
# "c950" (sample mass after 950 C)
loi$dry <- (loi$dry_weight - loi$crucible)
loi$c550mass <- (loi$after_550_C - loi$crucible)
loi$c950mass <- (loi$after_950_C - loi$crucible)

loi$c550 <- (loi$dry - loi$c550mass)/(loi$dry) * 100
loi$c950 <- (loi$c550mass - loi$c950mass)/(loi$c550mass) * 100

# Same with TGA
tga$c550 <- (tga$Dry - tga$Mass_550)/(tga$Dry) * 100
tga$c950 <- (tga$Mass_550 - tga$Mass_950)/(tga$Mass_550) * 100

Manual LOI results

# Total distribution of LOI in 550 C (1) and 950 C (2)
boxplot(loi$c550, loi$c950)

# boxplot.stats gives us the lower whisker or minimum, the interquartile range of the middle (the second and fourth values), the median (third value) and the upper whisker or maximum. 
boxplot.stats(loi$c950)
## $stats
## [1]  0.398150  1.912342  3.049426  9.981413 19.321298
## 
## $n
## [1] 78
## 
## $conf
## [1] 1.605872 4.492981
## 
## $out
##  [1] 29.25001 40.50466 33.45907 42.03850 42.60863 29.14914 29.25505 27.79538
##  [9] 33.34575 40.99217 26.13474 41.29933 32.40429
# The two high carbonate outliers are both AY-53 (two different measurements to make sure the first batch wasn't misleading)

# Removing duplicates from the next graph
rownames(loi)
##  [1] "AH-1"    "AH-2"    "AH-3"    "AH-4"    "AH-5"    "AH-6"    "AH-7"   
##  [8] "AH-8"    "AH-9"    "TI-1"    "TI-2"    "TI-3"    "TI-4"    "TI-5"   
## [15] "TI-6"    "TI-7"    "TI-8"    "TI-10"   "RLZ-1"   "AY-43"   "AY-44"  
## [22] "AY-45"   "AY-46"   "AY-47"   "AY-48"   "AY-49"   "AY-1"    "AY-2"   
## [29] "AY-3"    "AY-4"    "AY-5"    "AY-6"    "AY-7"    "AY-8"    "AY-9"   
## [36] "AY-10"   "AY-11"   "AY-12"   "AY-13"   "AY-14"   "AY-15"   "AY-16"  
## [43] "AY-17"   "AY-18"   "AY-19"   "AY-20"   "AY-21"   "AY-22"   "AY-23"  
## [50] "AY-24"   "AY-25"   "AY-26"   "AY-27"   "AY-28"   "AY-29"   "AY-30"  
## [57] "AY-31"   "AY-32"   "AY-33"   "AY-34"   "AY-35"   "AY-36"   "AY-37"  
## [64] "AY-38"   "AY-39"   "AY-40"   "AY-41"   "AY-42"   "AY-50"   "AY-51"  
## [71] "AY-52"   "AY-53"   "AY-54"   "AY-50_2" "AY-51_2" "AY-52_2" "AY-53_2"
## [78] "AY-54_2"
loi <- subset(loi[1:72, ])

### add ggplots later with context and type

barplot(loi$c550)

barplot(loi$c950)

# Stacked barplot (x = reorder(sample, LOI))
loi_long = gather(loi, key = var, value = value, c550, c950)

ggplot(loi_long, aes(x = reorder(sample, value), y = value, fill = var)) +
  geom_bar(stat = 'identity', position = 'stack') + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

TGA LOI results

# Total distribution of LOI in 550 C (1) and 950 C (2)
boxplot(tga$c550, tga$c950)

# boxplot.stats gives us the lower whisker or minimum, the interquartile range of the middle (the second and fourth values), the median (third value) and the upper whisker or maximum. 
boxplot.stats(tga$c950)
## $stats
## [1] 0.7233976 1.6976633 3.2631063 5.4590326 9.7816508
## 
## $n
## [1] 69
## 
## $conf
## [1] 2.547658 3.978555
## 
## $out
##  [1] 12.04383 11.73184 29.72412 42.03111 42.69172 30.94577 30.11137 28.19411
##  [9] 34.00000 41.45557 29.66118 41.94993
# Removing duplicates from the next graph
#rownames(tga)
#loi <- subset(loi[1:47, ])

ggplot(tga, 
      aes(c550, c950, label = rownames(tga))) +
      geom_point(size=2) +
      geom_text_repel(size=3) + labs(title = "Organic vs. carbonate content") +
      xlab('Organic LOI (%)') +
      ylab('Carbonate LOI (%)') +
      theme(axis.title = element_text())

barplot(tga$c550)

barplot(tga$c950)

# Stacked barplot (x = reorder(sample, LOI))
tga_long = gather(tga, key = var, value = value, c550, c950)

ggplot(tga_long, aes(x = reorder(Name, value), y = value, fill = var)) +
  geom_bar(stat = 'identity', position = 'stack') + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Colorimetry

# Libraries
library(dplyr); 
library(ggplot2); 
library(ggrepel);
library(rgl);
library(plot3D);
library(colordistance)

color1 <- read.table("data/color_israel.txt", header = TRUE, sep = "\t", dec = ",")
rownames(color1) <- color1$Data.Name
color <- subset(color1[, 3:5])
summary(color)
##        L               a                b        
##  Min.   :43.75   Min.   : 3.160   Min.   :13.85  
##  1st Qu.:56.08   1st Qu.: 3.770   1st Qu.:14.71  
##  Median :64.30   Median : 4.350   Median :15.85  
##  Mean   :63.19   Mean   : 6.649   Mean   :17.27  
##  3rd Qu.:71.51   3rd Qu.: 6.280   3rd Qu.:17.70  
##  Max.   :75.34   Max.   :19.530   Max.   :25.84
ggplot(color, 
      aes(a, b, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) + 
      labs(title = "Color (a*b)") +
      xlab("a D65") +
      ylab("b D65") 

ggplot(color1,
      aes(X, L, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) +     
      labs(title = "Luminance") +
      ylab("L* (D65)") +
      xlab("Sample number") 

ggplot(color,
      aes(a, L, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) +     
      labs(title = "A versus luminance") +
      ylab("L* (D65)") +
      xlab("a") 

ggplot(color,
      aes(b, L, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) +     
      labs(title = "B versus luminance") +
      ylab("L* (D65)") +
      xlab("b") 

# Colorful 3D plot with drop lines
scatter3D(color$a, color$b, color$L, 
          xlab = "a* (D65)", ylab = "b* (D65)", zlab="L* (D65)",
          phi = 0, bty = "g",  type = "h", 
          ticktype = "detailed", pch = 19, cex = 1.2)
          text3D(color$a, color$b, color$L,  labels = rownames(color),
          add = TRUE, colkey = FALSE, cex = 0.7)

scatter3D(color$L, color$a, color$b, 
          xlab = "L* (D65)", ylab = "a* (D65)", zlab="b* (D65)",
          phi = 0, bty = "g",  type = "h", 
          ticktype = "detailed", pch = 19, cex = 1.2)
          text3D(color$L, color$a, color$b,  labels = rownames(color),
          add = TRUE, colkey = FALSE, cex = 0.7)

Palaepaphos

Intro about the site

pXRF

Introduction

# Libraries
library(dplyr); 
library(GGally); 
library(corrplot); 
library(tidyr); 
library(openxlsx); 
library(cluster); 
library(ggplot2); 
library(ggfortify); 
library(pca3d);
library(FactoMineR); # fast PCA graphs
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(tidyverse) # rectangles around HClust

Data preparation

# Read data
pxrf <- read.xlsx("data/pxrf_PP.xlsx", sep=";")

# Remove any rows containing words 'ERROR' or 'TEST'
pxrf <- filter(pxrf, !grepl('TEST', Sample))
pxrf <- filter(pxrf, !grepl('ERROR', Sample))

# Turn any "LOD":s and missing values to "NA":s to allow scaling
values <- c("MgO", "Al2O3", "SiO2", 
            "S", "Cl", "K2O", "CaO", "Ti", "V", "Cr", 
            "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "As", 
            "Se", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", 
            "Rh", "Pd", "Ag", "Cd", "Sn", "Sb", 
            "La", "Ce", "Hf", "Ta", "W", "Pt", "Au", 
            "Hg", "Tl", "Pb", "Bi", "Th", "U")

pxrf_NA <- select(pxrf, one_of(values))
pxrf_NA <- pxrf_NA %>% mutate_if(is.character,as.numeric)

# Average data by "Sample", "Area" and "Type" columns from the original dataset
average_pxrf <- aggregate(pxrf_NA, by = list(pxrf$Sample, pxrf$Area, pxrf$Type), FUN = mean)

# Assign sample names as row names, and rename the other newly created columns back to "Type" and "Area" for clarity
rownames(average_pxrf) <- average_pxrf$Group.1
average_pxrf <- average_pxrf %>% select(-Group.1)
names(average_pxrf)[names(average_pxrf) == "Group.2"] <- "Area"
names(average_pxrf)[names(average_pxrf) == "Group.3"] <- "Type"

# Change "Type" and "Area" to factors
average_pxrf$Type <- factor(average_pxrf$Type)
average_pxrf$Area <- factor(average_pxrf$Area)

# Removing columns that only have NA values
all_na <- function(x) any(!is.na(x))
pxrf_all <- average_pxrf %>% select_if(all_na)

# Removing columns if they contain any NA values at all
any_na <- function(x) all(!is.na(x))
pxrf_no_na <- average_pxrf %>% select_if(any_na)

# Datasets with all the samples, including soil and old mudbrick samples
summary(pxrf_no_na)
##    Area          Type          Ti               Mn                Fe        
##  new :11   mudbrick:45   Min.   :0.0969   Min.   :0.01190   Min.   :0.9617  
##  old :34   soil    : 8   1st Qu.:0.1814   1st Qu.:0.03893   1st Qu.:2.2241  
##  soil: 8                 Median :0.2118   Median :0.04670   Median :2.7214  
##                          Mean   :0.2180   Mean   :0.04549   Mean   :2.8091  
##                          3rd Qu.:0.2594   3rd Qu.:0.05440   3rd Qu.:3.3530  
##                          Max.   :0.3571   Max.   :0.06800   Max.   :4.7754  
##        Ni                 Cu                 Zr          
##  Min.   :0.000400   Min.   :0.003600   Min.   :0.004300  
##  1st Qu.:0.002200   1st Qu.:0.005933   1st Qu.:0.007800  
##  Median :0.003000   Median :0.006500   Median :0.008800  
##  Mean   :0.002856   Mean   :0.006761   Mean   :0.008810  
##  3rd Qu.:0.003500   3rd Qu.:0.007400   3rd Qu.:0.009667  
##  Max.   :0.005900   Max.   :0.011333   Max.   :0.012200
summary(pxrf_all)
##    Area          Type         MgO            Al2O3            SiO2       
##  new :11   mudbrick:45   Min.   :2.919   Min.   :2.182   Min.   : 9.164  
##  old :34   soil    : 8   1st Qu.:3.369   1st Qu.:3.403   1st Qu.:14.598  
##  soil: 8                 Median :4.088   Median :4.317   Median :17.087  
##                          Mean   :4.011   Mean   :4.168   Mean   :16.502  
##                          3rd Qu.:4.499   3rd Qu.:4.702   3rd Qu.:18.560  
##                          Max.   :5.125   Max.   :5.902   Max.   :22.259  
##                          NA's   :34      NA's   :34      NA's   :34      
##        S                 Cl               K2O              CaO       
##  Min.   :0.04373   Min.   :0.01113   Min.   :0.1980   Min.   :11.28  
##  1st Qu.:0.10686   1st Qu.:0.02597   1st Qu.:0.3599   1st Qu.:14.53  
##  Median :0.21532   Median :0.05117   Median :0.4238   Median :18.05  
##  Mean   :0.25538   Mean   :0.13135   Mean   :0.4340   Mean   :17.94  
##  3rd Qu.:0.32239   3rd Qu.:0.10892   3rd Qu.:0.4795   3rd Qu.:20.82  
##  Max.   :0.74597   Max.   :0.93440   Max.   :0.6584   Max.   :25.59  
##  NA's   :35        NA's   :34        NA's   :36       NA's   :34     
##        Ti               V                 Mn                Fe        
##  Min.   :0.0969   Min.   :0.00267   Min.   :0.01190   Min.   :0.9617  
##  1st Qu.:0.1814   1st Qu.:0.00334   1st Qu.:0.03893   1st Qu.:2.2241  
##  Median :0.2118   Median :0.00420   Median :0.04670   Median :2.7214  
##  Mean   :0.2180   Mean   :0.00425   Mean   :0.04549   Mean   :2.8091  
##  3rd Qu.:0.2594   3rd Qu.:0.00512   3rd Qu.:0.05440   3rd Qu.:3.3530  
##  Max.   :0.3571   Max.   :0.00587   Max.   :0.06800   Max.   :4.7754  
##                   NA's   :41                                          
##        Ni                 Cu                 Zn                As         
##  Min.   :0.000400   Min.   :0.003600   Min.   :0.00167   Min.   :0.00033  
##  1st Qu.:0.002200   1st Qu.:0.005933   1st Qu.:0.00308   1st Qu.:0.00033  
##  Median :0.003000   Median :0.006500   Median :0.00337   Median :0.00040  
##  Mean   :0.002856   Mean   :0.006761   Mean   :0.00335   Mean   :0.00040  
##  3rd Qu.:0.003500   3rd Qu.:0.007400   3rd Qu.:0.00382   3rd Qu.:0.00047  
##  Max.   :0.005900   Max.   :0.011333   Max.   :0.00500   Max.   :0.00047  
##                                        NA's   :34        NA's   :48       
##        Rb                Sr                Y                 Zr          
##  Min.   :0.00120   Min.   :0.01790   Min.   :0.00087   Min.   :0.004300  
##  1st Qu.:0.00173   1st Qu.:0.02893   1st Qu.:0.00137   1st Qu.:0.007800  
##  Median :0.00203   Median :0.03403   Median :0.00160   Median :0.008800  
##  Mean   :0.00204   Mean   :0.03402   Mean   :0.00161   Mean   :0.008810  
##  3rd Qu.:0.00213   3rd Qu.:0.03835   3rd Qu.:0.00173   3rd Qu.:0.009667  
##  Max.   :0.00357   Max.   :0.04840   Max.   :0.00260   Max.   :0.012200  
##  NA's   :35        NA's   :34        NA's   :36                          
##        Nb                Rh                Ag                Pb       
##  Min.   :0.00063   Min.   :0.01570   Min.   :0.00230   Min.   :0.002  
##  1st Qu.:0.00063   1st Qu.:0.01683   1st Qu.:0.00282   1st Qu.:0.002  
##  Median :0.00063   Median :0.01797   Median :0.00322   Median :0.002  
##  Mean   :0.00063   Mean   :0.01797   Mean   :0.00330   Mean   :0.002  
##  3rd Qu.:0.00063   3rd Qu.:0.01910   3rd Qu.:0.00369   3rd Qu.:0.002  
##  Max.   :0.00063   Max.   :0.02023   Max.   :0.00447   Max.   :0.002  
##  NA's   :52        NA's   :51        NA's   :49        NA's   :52
# Datasets with only our new mudbrick samples (no soil or previously published ones)
MB_nona <- pxrf_no_na[c(1:11), ]
MB_all <- pxrf_all[c(1:11), ]

# Removing As, Nb, Rh, Ag and Pb as they have almost no viable measurement values
MB_all <- MB_all %>% 
        select(-As) %>% 
        select(-Nb) %>% 
        select(-Rh) %>%   
        select(-Ag) %>% 
        select(-Pb) 

summary(MB_nona)
##    Area          Type          Ti               Mn                Fe        
##  new :11   mudbrick:11   Min.   :0.0969   Min.   :0.01190   Min.   :0.9617  
##  old : 0   soil    : 0   1st Qu.:0.1663   1st Qu.:0.02397   1st Qu.:1.6270  
##  soil: 0                 Median :0.1915   Median :0.03347   Median :2.1312  
##                          Mean   :0.1780   Mean   :0.03070   Mean   :1.8912  
##                          3rd Qu.:0.1941   3rd Qu.:0.03895   3rd Qu.:2.2622  
##                          Max.   :0.2295   Max.   :0.04177   Max.   :2.4481  
##        Ni                 Cu                 Zr          
##  Min.   :0.001400   Min.   :0.004500   Min.   :0.005400  
##  1st Qu.:0.002317   1st Qu.:0.005533   1st Qu.:0.007767  
##  Median :0.003000   Median :0.006700   Median :0.008733  
##  Mean   :0.002812   Mean   :0.007333   Mean   :0.008494  
##  3rd Qu.:0.003367   3rd Qu.:0.008717   3rd Qu.:0.009100  
##  Max.   :0.003967   Max.   :0.011333   Max.   :0.011833
summary(MB_all)
##    Area          Type         MgO            Al2O3            SiO2       
##  new :11   mudbrick:11   Min.   :2.919   Min.   :2.182   Min.   : 9.164  
##  old : 0   soil    : 0   1st Qu.:3.531   1st Qu.:3.403   1st Qu.:13.366  
##  soil: 0                 Median :4.088   Median :4.317   Median :17.705  
##                          Mean   :3.999   Mean   :3.941   Mean   :15.874  
##                          3rd Qu.:4.400   3rd Qu.:4.593   3rd Qu.:18.560  
##                          Max.   :5.125   Max.   :5.270   Max.   :20.334  
##                                                                          
##        S                Cl               K2O              CaO       
##  Min.   :0.1622   Min.   :0.02723   Min.   :0.1980   Min.   :11.28  
##  1st Qu.:0.2737   1st Qu.:0.06783   1st Qu.:0.3920   1st Qu.:18.46  
##  Median :0.3221   Median :0.09397   Median :0.4376   Median :20.74  
##  Mean   :0.3550   Mean   :0.20759   Mean   :0.4312   Mean   :19.67  
##  3rd Qu.:0.3809   3rd Qu.:0.23782   3rd Qu.:0.4757   3rd Qu.:22.12  
##  Max.   :0.7460   Max.   :0.93440   Max.   :0.6052   Max.   :25.59  
##                                     NA's   :1                       
##        Ti               V                  Mn                Fe        
##  Min.   :0.0969   Min.   :0.002667   Min.   :0.01190   Min.   :0.9617  
##  1st Qu.:0.1663   1st Qu.:0.004317   1st Qu.:0.02397   1st Qu.:1.6270  
##  Median :0.1915   Median :0.004750   Median :0.03347   Median :2.1312  
##  Mean   :0.1780   Mean   :0.004721   Mean   :0.03070   Mean   :1.8912  
##  3rd Qu.:0.1941   3rd Qu.:0.005633   3rd Qu.:0.03895   3rd Qu.:2.2622  
##  Max.   :0.2295   Max.   :0.005867   Max.   :0.04177   Max.   :2.4481  
##                   NA's   :3                                            
##        Ni                 Cu                 Zn                 Rb          
##  Min.   :0.001400   Min.   :0.004500   Min.   :0.001667   Min.   :0.001200  
##  1st Qu.:0.002317   1st Qu.:0.005533   1st Qu.:0.002467   1st Qu.:0.001617  
##  Median :0.003000   Median :0.006700   Median :0.003233   Median :0.001767  
##  Mean   :0.002812   Mean   :0.007333   Mean   :0.003170   Mean   :0.001750  
##  3rd Qu.:0.003367   3rd Qu.:0.008717   3rd Qu.:0.003800   3rd Qu.:0.001925  
##  Max.   :0.003967   Max.   :0.011333   Max.   :0.005000   Max.   :0.002133  
##                                                           NA's   :1         
##        Sr                Y                   Zr          
##  Min.   :0.01790   Min.   :0.0008667   Min.   :0.005400  
##  1st Qu.:0.02893   1st Qu.:0.0013333   1st Qu.:0.007767  
##  Median :0.03507   Median :0.0014000   Median :0.008733  
##  Mean   :0.03438   Mean   :0.0013741   Mean   :0.008494  
##  3rd Qu.:0.03835   3rd Qu.:0.0014667   3rd Qu.:0.009100  
##  Max.   :0.04840   Max.   :0.0017333   Max.   :0.011833  
##                    NA's   :2

Elemental correlations in new mudbrick samples

# Visualize correlations between elements
cor(MB_all[3:20]) %>% corrplot (type = "lower", tl.cex = 1, tl.pos="d", cl.pos="r")

K-means cluster analysis

# Optimal number of clusters in K-means analysis: 2
k_max <- 8
twcss <- sapply(1:k_max, function(k){kmeans(MB_nona[3:8], k)$tot.withinss})
qplot(x = 1:k_max, y = twcss, geom = 'line')

# K-means clustering in ggpairs: 2 clusters
km <-kmeans(MB_nona[3:8], centers = 2)
cluster_x <- as.factor(km$cluster)
ggpairs(MB_nona[3:8], mapping = aes(col = cluster_x))

pXRF: PCA

PCA with only the no-NA:s elements

# PCA for only non-NA elements
colnames(MB_nona)
## [1] "Area" "Type" "Ti"   "Mn"   "Fe"   "Ni"   "Cu"   "Zr"
pca_1 <- prcomp(MB_nona[3:8], scale = TRUE, center = TRUE)
summary(pca_1)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     2.0136 1.0789 0.70395 0.45806 0.25323 0.10779
## Proportion of Variance 0.6758 0.1940 0.08259 0.03497 0.01069 0.00194
## Cumulative Proportion  0.6758 0.8698 0.95241 0.98738 0.99806 1.00000
biplot(pca_1, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"))

autoplot(pca_1, data=MB_nona, colour="red", shape = FALSE, label = TRUE,  main = "PCA Palaepaphos 1: no NA:s at all")

PCA(MB_nona[3:8])

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 11 individuals, described by 6 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

PCA with all the feasible elements

# NA:s converted to zeros
MB_all[is.na(MB_all)] <- 0

colnames(MB_all)
##  [1] "Area"  "Type"  "MgO"   "Al2O3" "SiO2"  "S"     "Cl"    "K2O"   "CaO"  
## [10] "Ti"    "V"     "Mn"    "Fe"    "Ni"    "Cu"    "Zn"    "Rb"    "Sr"   
## [19] "Y"     "Zr"
pca_2 <- prcomp((MB_all[3:20]), scale = TRUE, center = TRUE)
summary(pca_2)
## Importance of components:
##                           PC1    PC2     PC3     PC4    PC5     PC6    PC7
## Standard deviation     3.1620 1.8251 1.26077 1.10902 0.8950 0.63845 0.5612
## Proportion of Variance 0.5554 0.1851 0.08831 0.06833 0.0445 0.02265 0.0175
## Cumulative Proportion  0.5554 0.7405 0.82882 0.89715 0.9416 0.96429 0.9818
##                           PC8     PC9    PC10      PC11
## Standard deviation     0.4388 0.34174 0.13583 2.812e-16
## Proportion of Variance 0.0107 0.00649 0.00103 0.000e+00
## Cumulative Proportion  0.9925 0.99897 1.00000 1.000e+00
biplot(pca_2, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"))

autoplot(pca_2, data=MB_all, colour="red", shape = FALSE, label = TRUE,  main = "PCA Palaepaphos 2: NA:s permitted")

PCA(MB_all[3:20])

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 11 individuals, described by 18 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

PCA with all the Mudbrick and soil samples (including the previously published ones): No NA values permitted

# PCA for only non-NA elements
colnames(pxrf_no_na)
## [1] "Area" "Type" "Ti"   "Mn"   "Fe"   "Ni"   "Cu"   "Zr"
pca_3 <- prcomp(na.omit(pxrf_no_na[3:6]), scale = TRUE, center = TRUE)
summary(pca_3)
## Importance of components:
##                           PC1    PC2     PC3     PC4
## Standard deviation     1.6514 0.9995 0.41614 0.31724
## Proportion of Variance 0.6818 0.2498 0.04329 0.02516
## Cumulative Proportion  0.6818 0.9315 0.97484 1.00000
biplot(pca_3, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"))

autoplot(pca_3, data=pxrf_no_na, colour='Area', shape = FALSE, label = TRUE,  main = "PCA Palaepaphos grouped by area")

autoplot(pca_3, data=pxrf_no_na, colour='Type', shape = FALSE, label = TRUE,  main = "PCA Palaepaphos grouped by sample type")

PCA with all the Mudbrick samples

(including the previously published ones): NA values are permitted This is not feasible as the difference in what values are available is too different between old and new samples.

# NA:s converted to zeros
pxrf_all[is.na(pxrf_all)] <- 0

colnames(pxrf_all)
##  [1] "Area"  "Type"  "MgO"   "Al2O3" "SiO2"  "S"     "Cl"    "K2O"   "CaO"  
## [10] "Ti"    "V"     "Mn"    "Fe"    "Ni"    "Cu"    "Zn"    "As"    "Rb"   
## [19] "Sr"    "Y"     "Zr"    "Nb"    "Rh"    "Ag"    "Pb"
pca_4 <- prcomp(pxrf_all[3:25], scale = TRUE, center = TRUE)
summary(pca_4)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     3.3748 1.8765 1.50629 1.13013 1.04455 0.89816 0.83735
## Proportion of Variance 0.4952 0.1531 0.09865 0.05553 0.04744 0.03507 0.03049
## Cumulative Proportion  0.4952 0.6483 0.74692 0.80245 0.84989 0.88497 0.91545
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.77794 0.66209 0.53427 0.40495 0.35837 0.33354 0.27679
## Proportion of Variance 0.02631 0.01906 0.01241 0.00713 0.00558 0.00484 0.00333
## Cumulative Proportion  0.94176 0.96082 0.97323 0.98036 0.98595 0.99078 0.99412
##                           PC15    PC16    PC17    PC18    PC19    PC20    PC21
## Standard deviation     0.22788 0.17584 0.14605 0.12032 0.10296 0.06388 0.04325
## Proportion of Variance 0.00226 0.00134 0.00093 0.00063 0.00046 0.00018 0.00008
## Cumulative Proportion  0.99637 0.99772 0.99865 0.99927 0.99974 0.99991 0.99999
##                           PC22      PC23
## Standard deviation     0.01157 2.376e-16
## Proportion of Variance 0.00001 0.000e+00
## Cumulative Proportion  1.00000 1.000e+00
biplot(pca_4, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"))

autoplot(pca_4, data=pxrf_all, colour='Area', shape = FALSE, label = TRUE,  main = "PCA Palaepaphos grouped by area")

autoplot(pca_4, data=pxrf_all, colour='Type', shape = FALSE, label = TRUE,  main = "PCA Palaepaphos grouped by sample type")

HCA analysis

HCA including only new mudbrick samples:

# HCA dendrogam 1
dend <- 
    MB_all[3:20] %>%           # data
    dist %>%                            # calculate a distance matrix
    hclust(method = "ward.D2") %>%      # hierarchical clustering 
    as.dendrogram %>%                   # turn the object into a dendrogram
    set("branches_k_color", k=5) %>% 
    set("branches_lwd", 1.2) %>%
    set("labels_col", c("red", "black", "blue"), k=5) %>%    
    set("labels_cex", 1) %>% 
    set("leaves_pch", NA) 
# plot the dend in usual "base" plotting engine:
plot(dend)

# HCA dendrogam 2
HCA.ward5 <- hclust(dist(MB_all[3:20], method="euclid"), method="ward.D2")
plot(HCA.ward5, main="Ward's method")
rect.hclust(HCA.ward5, k=5)

# Divisive hierarchical clustering (DIANA) just for comparison
hc4 <- diana(MB_all[3:20])
pltree(hc4, cex = 1, hang = -1, main = "Dendrogram of diana")

# Divise coefficient; amount of clustering structure found
hc4$dc
## [1] 0.7894269

HCA analysis

HCA including all mudbrick samples, only no NA:s elements:

# HCA dendrogam 1
dend <- 
    pxrf_no_na[1:45, 3:8] %>%           # data
    dist %>%                            # calculate a distance matrix
    hclust(method = "ward.D2") %>%      # hierarchical clustering 
    as.dendrogram %>%                   # turn the object into a dendrogram
    set("branches_k_color", k=5) %>% 
    set("branches_lwd", 1.2) %>%
    set("labels_col", c("red", "black", "blue"), k=5) %>%    
    set("labels_cex", 1) %>% 
    set("leaves_pch", NA) 
# plot the dend in usual "base" plotting engine:
plot(dend)

# HCA dendrogam 2
HCA.ward5 <- hclust(dist(pxrf_no_na[1:45, 3:8], method="euclid"), method="ward.D2")
plot(HCA.ward5, main="Ward's method")
rect.hclust(HCA.ward5, k=5)

# Divisive hierarchical clustering (DIANA) just for comparison
hc4 <- diana(pxrf_no_na[3:8])
pltree(hc4, cex = 1, hang = -1, main = "Dendrogram of diana")

# Divise coefficient; amount of clustering structure found
hc4$dc
## [1] 0.9804178

Grain size analysis

# Libraries
library(dplyr); 
library(GGally); 
library(corrplot); 
library(tidyr); 
library(openxlsx); 
library(cluster); 
library(ggplot2); 
library(ggrepel);
library(ggfortify); 
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots

# Read and filter data
particle <- read.xlsx("data/grain_PP.xlsx", sep=";")

# Remove automatically created averages in order to include results from multiple runs for the same sample
particle <- particle %>% 
  filter(!grepl('Average', Sample)) %>% 
  filter(!grepl('test', Sample))

# Average by sample name
particle <- aggregate(particle, by=list(particle$Sample), FUN=mean)
particle <- particle %>% select(-Sample)
names(particle)[names(particle) == "Group.1"] <- "Sample"

# Saving the clean data as "PP_grain2"
## write.xlsx(particle, file="data/granulometry/PP_grain2.xlsx")

# Manually added Type and Context columns to PP_grain2 data: "PP_grain3"
grain3 <- read.xlsx("data/granulometry/PP_grain3.xlsx", sep=";")

# Samples as rownames
rownames(grain3) <- grain3$Sample 

# Scaling clay-silt-sand portions to values between 0 and 1
## Otherwise the differences in low clay percentages are lost
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}

grain <- as.data.frame(apply(grain3[3:5], 2, normalize))

# Ternary plots

MB_grain <- grain[c(9:19),]
MB_context <- grain3[c(9:19),]

ggtern(data=MB_grain, aes(x=Clay, y=Sand, z=Silt, color=MB_context$Locus)) +
  labs(title="Clay silt sand") +
  theme_rgbw() +
  theme_nomask() +
  geom_point(size=0) +
  geom_text(aes(label=rownames(MB_grain)), size=3)

ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
  labs(title="Clay silt sand") +
  theme_nomask() +
  geom_mask() +
  geom_point(size=2) 

Loss on ignition

# Libraries
library(dplyr); 
library(GGally); 
library(corrplot); 
library(tidyr); 
library(openxlsx); 
library(cluster); 
library(ggplot2); 
library(ggrepel);
library(ggfortify); 
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots

Loss on ignition

Preparing data

loi <- read.xlsx("data/loi_PP.xlsx", sep=";")
tga <- read.xlsx("data/tga_PP.xlsx", sep=";")

summary(loi)
##      type              sample             crucible       wet;weight    
##  Length:27          Length:27          Min.   :8.100   Min.   : 9.637  
##  Class :character   Class :character   1st Qu.:8.639   1st Qu.:10.859  
##  Mode  :character   Mode  :character   Median :8.997   Median :11.184  
##                                        Mean   :8.988   Mean   :11.274  
##                                        3rd Qu.:9.454   3rd Qu.:11.844  
##                                        Max.   :9.560   Max.   :12.301  
##    sample;wet      dry_weight      after_550_C      after_950_C   
##  Min.   :1.537   Min.   : 9.595   Min.   : 9.484   Min.   : 9.23  
##  1st Qu.:2.193   1st Qu.:10.813   1st Qu.:10.721   1st Qu.:10.27  
##  Median :2.371   Median :11.132   Median :11.019   Median :10.59  
##  Mean   :2.286   Mean   :11.216   Mean   :11.113   Mean   :10.67  
##  3rd Qu.:2.453   3rd Qu.:11.777   3rd Qu.:11.681   3rd Qu.:11.18  
##  Max.   :2.785   Max.   :12.218   Max.   :12.076   Max.   :11.56
summary(tga)
##      Name           Analysis;Date         Batch               Wet           
##  Length:30          Length:30          Length:30          Length:30         
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##      Dry              Mass_550           Mass_950        
##  Length:30          Length:30          Length:30         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character
# Remove rows containing words 'kekkila' (internal standard)
tga <- filter(tga, !grepl('kekkila', Name))

# TGA character columns to numeric
tga[, c(4:7)] <- sapply(tga[, c(4:7)], as.numeric)

# Sample names as rownames
rownames(loi) <- loi$sample
rownames(tga) <- tga$Name

# Creating new columns for analysis by subtracting crucible mass;
# "dry" (sample mass after drying)
# "c550" (sample mass after 550 C)
# "c950" (sample mass after 950 C)
loi$dry <- (loi$dry_weight - loi$crucible)
loi$c550mass <- (loi$after_550_C - loi$crucible)
loi$c950mass <- (loi$after_950_C - loi$crucible)

loi$c550 <- (loi$dry - loi$c550mass)/(loi$dry) * 100
loi$c950 <- (loi$c550mass - loi$c950mass)/(loi$c550mass) * 100

# Same with TGA
tga$c550 <- (tga$Dry - tga$Mass_550)/(tga$Dry) * 100
tga$c950 <- (tga$Mass_550 - tga$Mass_950)/(tga$Mass_550) * 100

Manual LOI results

# Total distribution of LOI in 550 C (1) and 950 C (2)
boxplot(loi$c550, loi$c950)

# boxplot.stats gives us the lower whisker or minimum, the interquartile range of the middle (the second and fourth values), the median (third value) and the upper whisker or maximum. 
boxplot.stats(loi$c950)
## $stats
## [1] 14.55842 18.28992 20.42575 21.68496 25.15246
## 
## $n
## [1] 27
## 
## $conf
## [1] 19.39341 21.45808
## 
## $out
## [1] 33.21076 33.78112
# The two high carbonate outliers are both AY-53 (two different measurements to make sure the first batch wasn't misleading)

# Removing duplicates from the next graph
rownames(loi)
##  [1] "PP-1"   "PP-2"   "PP-3"   "PP-4"   "PP-5"   "PP-6"   "PP-7"   "PP-8"  
##  [9] "PP-9"   "PP-10"  "PP-11"  "PP-12"  "PP-13"  "PP-14"  "PP-15"  "PP-16" 
## [17] "PP-17"  "PP-18"  "PP-19"  "PP-1_2" "PP-2_2" "PP-3_2" "PP-4_2" "PP-5_2"
## [25] "PP-6_2" "PP-7_2" "PP-8_2"
loi <- subset(loi[1:19, ])

ggplot(loi, 
      aes(c550, c950, label = rownames(loi), colour = factor(type))) +
      geom_point(size=2, aes(shape = factor(type))) +
      geom_text_repel(size=3) + labs(title = "Organic vs. carbonate content") +
      xlab('Organic LOI (%)') +
      ylab('Carbonate LOI (%)') +
      theme(axis.title = element_text())

barplot(loi$c550)

barplot(loi$c950)

# Stacked barplot (x = reorder(sample, LOI))
loi_long = gather(loi, key = var, value = value, c550, c950)

ggplot(loi_long, aes(x = reorder(sample, value), y = value, fill = var)) +
  geom_bar(stat = 'identity', position = 'stack') + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

TGA LOI results

# Total distribution of LOI in 550 C (1) and 950 C (2)
boxplot(tga$c550, tga$c950)

# boxplot.stats gives us the lower whisker or minimum, the interquartile range of the middle (the second and fourth values), the median (third value) and the upper whisker or maximum. 
boxplot.stats(tga$c950)
## $stats
## [1] 14.54839 18.93682 20.82648 22.49982 24.52314
## 
## $n
## [1] 28
## 
## $conf
## [1] 19.76259 21.89036
## 
## $out
## [1] 31.05334 33.79251
# Removing duplicates from the next graph
#rownames(tga)
#loi <- subset(loi[1:47, ])

ggplot(tga, 
      aes(c550, c950, label = rownames(tga))) +
      geom_point(size=2) +
      geom_text_repel(size=3) + labs(title = "Organic vs. carbonate content") +
      xlab('Organic LOI (%)') +
      ylab('Carbonate LOI (%)') +
      theme(axis.title = element_text())

barplot(tga$c550)

barplot(tga$c950)

# Stacked barplot (x = reorder(sample, LOI))
tga_long = gather(tga, key = var, value = value, c550, c950)

ggplot(tga_long, aes(x = reorder(Name, value), y = value, fill = var)) +
  geom_bar(stat = 'identity', position = 'stack') + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Colorimetry

# Libraries
library(dplyr); 
library(ggplot2); 
library(ggrepel);
library(rgl);
library(plot3D);
library(colordistance)

color1 <- read.table("data/color_PP.txt", header = TRUE, sep = "\t", dec = ",")
rownames(color1) <- color1$Data.Name
color <- subset(color1[, 3:5])
summary(color)
##        L               a               b        
##  Min.   :62.38   Min.   : 7.27   Min.   :19.90  
##  1st Qu.:63.47   1st Qu.: 8.50   1st Qu.:21.36  
##  Median :64.19   Median : 9.34   Median :21.72  
##  Mean   :65.50   Mean   : 9.29   Mean   :21.92  
##  3rd Qu.:65.78   3rd Qu.:10.56   3rd Qu.:22.96  
##  Max.   :72.00   Max.   :10.90   Max.   :23.90
ggplot(color, 
      aes(a, b, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) + 
      labs(title = "Color") +
      xlab("a D65") +
      ylab("b D65") 

ggplot(color1,
      aes(X, L, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) +     
      labs(title = "Luminance") +
      ylab("L* (D65)") +
      xlab("Sample number") 

ggplot(color,
      aes(a, L, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) +     
      labs(title = "Luminance") +
      ylab("L* (D65)") +
      xlab("a") 

ggplot(color,
      aes(b, L, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) +     
      labs(title = "Luminance") +
      ylab("L* (D65)") +
      xlab("b") 

# Colorful 3D plot with drop lines
scatter3D(color$a, color$b, color$L, 
          xlab = "a* (D65)", ylab = "b* (D65)", zlab="L* (D65)",
          phi = 0, bty = "g",  type = "h", 
          ticktype = "detailed", pch = 19, cex = 1.2)
          text3D(color$a, color$b, color$L,  labels = rownames(color),
          add = TRUE, colkey = FALSE, cex = 0.7)

scatter3D(color$L, color$a, color$b, 
          xlab = "L* (D65)", ylab = "a* (D65)", zlab="b* (D65)",
          phi = 0, bty = "g",  type = "h", 
          ticktype = "detailed", pch = 19, cex = 1.2)
          text3D(color$L, color$a, color$b,  labels = rownames(color),
          add = TRUE, colkey = FALSE, cex = 0.7)

Artaxata

Intro about the site

pXRF

# Libraries
library(dplyr); 
library(GGally); 
library(corrplot); 
library(tidyr); 
library(openxlsx); 
library(cluster); 
library(ggplot2); 
library(ggrepel);
library(ggfortify); 
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots

# Read the raw data
pxrf <- read.xlsx("data/pxrf_AA.xlsx", sep=";")

# Remove any rows containing words 'ERROR' or 'TEST'
pxrf <- filter(pxrf, !grepl('TEST', Sample))
pxrf <- filter(pxrf, !grepl('ERROR', Sample))

# Turn any "LOD":s and missing values to "NA":s to allow scaling
# Removing "Na", S", "Cl", "P" and "Ba" already
values <- c("MgO", "Al2O3", "SiO2", 
            "K2O", "CaO", "Ti", "V", "Cr", 
            "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "As", 
            "Se", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", 
            "Rh", "Pd", "Ag", "Cd", "Sn", "Sb", 
            "La", "Ce", "Hf", "Ta", "W", "Pt", "Au", 
            "Hg", "Tl", "Pb", "Bi", "Th", "U")

pxrf_NA <- select(pxrf, one_of(values))
pxrf_NA <- pxrf_NA %>% mutate_if(is.character,as.numeric)

# Scaling the data with standard z-score method
scaled_pxrf <- as.data.frame(scale(pxrf_NA))

# Average data by "Sample", "Area" and "Type" columns from the original dataset
average_pxrf <- aggregate(scaled_pxrf, by = list(pxrf$Sample, pxrf$Area, pxrf$Type), FUN = mean)

# Assign sample names as row names, and rename the other newly created columns back to "Type" and "Area" for clarity
rownames(average_pxrf) <- average_pxrf$Group.1
average_pxrf <- average_pxrf %>% select(-Group.1)
names(average_pxrf)[names(average_pxrf) == "Group.2"] <- "Area"
names(average_pxrf)[names(average_pxrf) == "Group.3"] <- "Type"

# Change "Type" and "Area" to factors
average_pxrf$Type <- factor(average_pxrf$Type)
average_pxrf$Area <- factor(average_pxrf$Area)

# Removing columns that only have NA values
all_na <- function(x) any(!is.na(x))
pxrf_all <- average_pxrf %>% select_if(all_na)

# Removing columns if they contain any NA values at all
any_na <- function(x) all(!is.na(x))
pxrf_no_na <- average_pxrf %>% select_if(any_na)

pxrf_no_na:

glimpse(pxrf_no_na)
## Rows: 10
## Columns: 17
## $ Area  <fct> Hill II, Hill II, Hill II, Phase I, Phase I, Phase II, Phase II,~
## $ Type  <fct> mudbrick, mudbrick, mudbrick, mudbrick, mudbrick, mudbrick, mudb~
## $ MgO   <dbl> -0.1594010, 0.1129059, -0.4510038, -0.3887203, 0.4369466, 1.9131~
## $ Al2O3 <dbl> 0.6990613, 0.2737338, -0.1611149, -0.2592941, 0.4246452, 1.38943~
## $ SiO2  <dbl> 0.85557445, 0.61604918, -0.12790064, 0.09683497, 0.61414855, 0.7~
## $ K2O   <dbl> 0.60692024, 0.02816062, -0.75789490, 0.04880237, 1.11418034, 1.0~
## $ CaO   <dbl> -0.54131017, 1.20914863, 0.05989818, 0.23434386, 1.02571929, 0.9~
## $ Ti    <dbl> 0.8351083, 0.8389159, -0.2277469, -0.1097123, 0.8688325, 0.73502~
## $ Mn    <dbl> -0.16154103, 0.23307430, -0.07138547, -0.11572427, 0.55674754, 1~
## $ Fe    <dbl> 0.05533421, 0.13332917, -0.15388527, -0.15233743, 0.09919632, 1.~
## $ Ni    <dbl> 0.82749061, -0.04503350, -0.80497386, -0.94570355, -0.24205508, ~
## $ Cu    <dbl> 8.205941e-01, 2.188251e-01, -1.039419e+00, 4.811057e-16, 4.37650~
## $ Zn    <dbl> 0.41186175, -0.19830381, -0.88474007, -0.02033885, 0.33559106, 1~
## $ Rb    <dbl> 1.13127170, -0.15127470, -0.51301856, -1.20362047, -0.01973148, ~
## $ Sr    <dbl> -1.40588371, -0.28024838, 0.07368850, 0.63070392, 1.43141359, -0~
## $ Y     <dbl> 0.57088572, 0.04228783, -0.22201111, -0.27487090, -0.01057196, 1~
## $ Zr    <dbl> -0.4713101, -1.1158368, -0.5317345, -0.2094712, 1.3011383, 1.905~

pxrf_all:

glimpse(pxrf_all)
## Rows: 10
## Columns: 21
## $ Area  <fct> Hill II, Hill II, Hill II, Phase I, Phase I, Phase II, Phase II,~
## $ Type  <fct> mudbrick, mudbrick, mudbrick, mudbrick, mudbrick, mudbrick, mudb~
## $ MgO   <dbl> -0.1594010, 0.1129059, -0.4510038, -0.3887203, 0.4369466, 1.9131~
## $ Al2O3 <dbl> 0.6990613, 0.2737338, -0.1611149, -0.2592941, 0.4246452, 1.38943~
## $ SiO2  <dbl> 0.85557445, 0.61604918, -0.12790064, 0.09683497, 0.61414855, 0.7~
## $ K2O   <dbl> 0.60692024, 0.02816062, -0.75789490, 0.04880237, 1.11418034, 1.0~
## $ CaO   <dbl> -0.54131017, 1.20914863, 0.05989818, 0.23434386, 1.02571929, 0.9~
## $ Ti    <dbl> 0.8351083, 0.8389159, -0.2277469, -0.1097123, 0.8688325, 0.73502~
## $ V     <dbl> 0.02741905, 0.56361382, NA, NA, -0.43565825, -0.19193335, -0.460~
## $ Cr    <dbl> NA, NA, NA, NA, -0.6861787, 0.2217828, NA, NA, NA, NA
## $ Mn    <dbl> -0.16154103, 0.23307430, -0.07138547, -0.11572427, 0.55674754, 1~
## $ Fe    <dbl> 0.05533421, 0.13332917, -0.15388527, -0.15233743, 0.09919632, 1.~
## $ Ni    <dbl> 0.82749061, -0.04503350, -0.80497386, -0.94570355, -0.24205508, ~
## $ Cu    <dbl> 8.205941e-01, 2.188251e-01, -1.039419e+00, 4.811057e-16, 4.37650~
## $ Zn    <dbl> 0.41186175, -0.19830381, -0.88474007, -0.02033885, 0.33559106, 1~
## $ As    <dbl> 1.10179138, -0.87461790, NA, -0.05111403, 0.27828751, 0.44298829~
## $ Rb    <dbl> 1.13127170, -0.15127470, -0.51301856, -1.20362047, -0.01973148, ~
## $ Sr    <dbl> -1.40588371, -0.28024838, 0.07368850, 0.63070392, 1.43141359, -0~
## $ Y     <dbl> 0.57088572, 0.04228783, -0.22201111, -0.27487090, -0.01057196, 1~
## $ Zr    <dbl> -0.4713101, -1.1158368, -0.5317345, -0.2094712, 1.3011383, 1.905~
## $ Nb    <dbl> 0.93500042, -0.65796326, NA, -0.06060188, -0.06060188, -0.657963~

pXRF: correlations

Introduction

# Drop the more dubious elements
pxrf_1 <- pxrf_no_na %>% select(-Al2O3)
pxrf_1 <- pxrf_1 %>% select(-CaO)

# Visualize correlations between elements
cor(pxrf_1[3:15]) %>% corrplot (type = "lower", tl.cex = 1, tl.pos="d", cl.pos="r")

# Optimal number of clusters in K-means analysis: 2
k_max <- 8
twcss <- sapply(1:k_max, function(k){kmeans(pxrf_1[3:12], k)$tot.withinss})
qplot(x = 1:k_max, y = twcss, geom = 'line')

# K-means clustering in ggpairs: 2 clusters
km <-kmeans(pxrf_1[3:12], centers = 2)
cluster_x <- as.factor(km$cluster)
ggpairs(pxrf_1, mapping = aes(col = cluster_x))

pXRF: PCA

PCA with only the no-NA:s elements

# PCA
pca_1 <- prcomp(pxrf_1[3:15])
summary(pca_1)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.4147 1.0889 0.9306 0.48309 0.37987 0.31662 0.24466
## Proportion of Variance 0.6901 0.1403 0.1025 0.02762 0.01708 0.01186 0.00708
## Cumulative Proportion  0.6901 0.8305 0.9330 0.96059 0.97767 0.98953 0.99662
##                            PC8     PC9      PC10
## Standard deviation     0.15630 0.06455 7.266e-17
## Proportion of Variance 0.00289 0.00049 0.000e+00
## Cumulative Proportion  0.99951 1.00000 1.000e+00
# PCA1 plots
biplot(pca_1, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA limited elements")

autoplot(pca_1, data=pxrf_1, colour='Area', shape = FALSE, label = TRUE,  main = "PCA grouped by area")

PCA with all the feasible elements

# PCA with all elements, NA:s converted to zeros
pxrf_all[is.na(pxrf_all)] <- 0

pca_2 <- prcomp(pxrf_all[3:21])
summary(pca_2)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.6421 1.2026 0.98838 0.84185 0.45012 0.39002 0.30457
## Proportion of Variance 0.6529 0.1353 0.09136 0.06628 0.01895 0.01423 0.00868
## Cumulative Proportion  0.6529 0.7881 0.87946 0.94574 0.96469 0.97891 0.98759
##                            PC8     PC9      PC10
## Standard deviation     0.28180 0.23084 2.238e-16
## Proportion of Variance 0.00743 0.00498 0.000e+00
## Cumulative Proportion  0.99502 1.00000 1.000e+00
# PCA2 plots
biplot(pca_2, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA more elements")

autoplot(pca_2, data=pxrf_all, colour='Area', shape = FALSE, label = TRUE,  main = "PCA more elements grouped by area")

HCA

# HCA

# HCA dendrogam 1
dend <- 
    pxrf_1[3:12] %>%                    # data
    dist %>%                            # calculate a distance matrix
    hclust(method = "ward.D2") %>%      # hierarchical clustering 
    as.dendrogram %>%                   # turn the object into a dendrogram
    set("branches_k_color", k=4) %>% 
    set("branches_lwd", 1.2) %>%
    set("labels_col", c("red", "black", "blue"), k=4) %>%    
    set("labels_cex", 1) %>% 
    set("leaves_pch", NA) 
# plot the dend in usual "base" plotting engine:
plot(dend)

# HCA dendrogam 2: 
HCA.ward4 <- hclust(dist(pxrf_1[3:12], method="euclid"), method="ward.D2")
plot(HCA.ward4, main="HCA with clusters")
rect.hclust(HCA.ward4, k=4)

Grain size analysis

# Libraries
library(dplyr); 
library(GGally); 
library(corrplot); 
library(tidyr); 
library(openxlsx); 
library(cluster); 
library(ggplot2); 
library(ggrepel);
library(ggfortify); 
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots


## Particle size
# Read and filter data
particle <- read.xlsx("data/grain_AA.xlsx", sep=";")

# Remove automatically created averages in order to include results from multiple runs for the same sample
particle <- particle %>% 
  filter(!grepl('Average', Sample)) %>% 
  filter(!grepl('test', Sample))

# Average by sample name
particle <- aggregate(particle, by=list(particle$Sample), FUN=mean)
particle <- particle %>% select(-Sample)
names(particle)[names(particle) == "Group.1"] <- "Sample"

# Saving the clean data as "AY_grain2"
## write.xlsx(particle, file="data/granulometry/AA_grain2.xlsx")

# Manually added Type and Context columns to AA_grain2 data: "AA_grain3"
grain3 <- read.xlsx("data/granulometry/AA_grain3.xlsx", sep=";")

# Samples as rownames
rownames(grain3) <- grain3$Sample 

# Scaling clay-silt-sand portions to values between 0 and 1
## Otherwise the differences in low clay percentages are lost
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}

grain <- as.data.frame(apply(grain3[3:5], 2, normalize))

# Ternary plots

ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
  labs(title="Clay silt sand") +
  theme_rgbw() +
  theme_nomask() +
  geom_point(size=0) +
  geom_text(aes(label=rownames(grain)), size=3)

Loss on ignition

We did loss on ignition analysis both in the traditional manual method and with a modern TGA for comparison.

# Libraries
library(dplyr); 
library(GGally); 
library(corrplot); 
library(tidyr); 
library(openxlsx); 
library(cluster); 
library(ggplot2); 
library(ggrepel);
library(ggfortify); 
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots

Preparing data

loi <- read.xlsx("data/loi_AA.xlsx", sep=";")
tga <- read.xlsx("data/tga_AA.xlsx", sep=";")

summary(loi)
##     sample             crucible       wet_weight      sample_wet   
##  Length:10          Min.   :8.066   Min.   :10.70   Min.   :1.923  
##  Class :character   1st Qu.:8.461   1st Qu.:10.93   1st Qu.:2.326  
##  Mode  :character   Median :8.995   Median :11.48   Median :2.558  
##                     Mean   :8.892   Mean   :11.39   Mean   :2.496  
##                     3rd Qu.:9.174   3rd Qu.:11.75   3rd Qu.:2.701  
##                     Max.   :9.890   Max.   :12.18   Max.   :2.842  
##    dry_weight     after_550_C     after_950_C   
##  Min.   :10.58   Min.   :10.47   Min.   :10.31  
##  1st Qu.:10.85   1st Qu.:10.78   1st Qu.:10.60  
##  Median :11.43   Median :11.33   Median :11.16  
##  Mean   :11.32   Mean   :11.22   Mean   :11.07  
##  3rd Qu.:11.69   3rd Qu.:11.58   3rd Qu.:11.42  
##  Max.   :12.13   Max.   :12.06   Max.   :11.94
summary(tga)
##      Name           Analysis;Date         Batch               Wet           
##  Length:12          Length:12          Length:12          Length:12         
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##      Dry              Mass_550           Mass_950        
##  Length:12          Length:12          Length:12         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character
# Remove rows containing words 'kekkila' (internal standard)
tga <- filter(tga, !grepl('kekkila', Name))

# TGA character columns to numeric
tga[, c(4:7)] <- sapply(tga[, c(4:7)], as.numeric)

# Sample names as rownames
rownames(loi) <- loi$sample
rownames(tga) <- tga$Name

# Creating new columns for analysis by subtracting crucible mass;
# "dry" (sample mass after drying)
# "c550" (sample mass after 550 C)
# "c950" (sample mass after 950 C)
loi$dry <- (loi$dry_weight - loi$crucible)
loi$c550mass <- (loi$after_550_C - loi$crucible)
loi$c950mass <- (loi$after_950_C - loi$crucible)

loi$c550 <- (loi$dry - loi$c550mass)/(loi$dry) * 100
loi$c950 <- (loi$c550mass - loi$c950mass)/(loi$c550mass) * 100

# Same with TGA
tga$c550 <- (tga$Dry - tga$Mass_550)/(tga$Dry) * 100
tga$c950 <- (tga$Mass_550 - tga$Mass_950)/(tga$Mass_550) * 100

Manual LOI results

# Total distribution of LOI in 550 C (1) and 950 C (2)
boxplot(loi$c550, loi$c950)

# boxplot.stats gives us the lower whisker or minimum, the interquartile range of the middle (the second and fourth values), the median (third value) and the upper whisker or maximum. 
boxplot.stats(loi$c950)
## $stats
## [1] 5.507166 5.756990 6.617026 7.738517 7.965070
## 
## $n
## [1] 10
## 
## $conf
## [1] 5.626976 7.607076
## 
## $out
## numeric(0)
# The two high carbonate outliers are both AY-53 (two different measurements to make sure the first batch wasn't misleading)

# Removing duplicates from the next graph
rownames(loi)
##  [1] "AA-1"  "AA-2"  "AA-3"  "AA-4"  "AA-5"  "AA-6"  "AA-7"  "AA-8"  "AA-9" 
## [10] "AA-10"
loi <- subset(loi[1:47, ])

### ggplots later with context and type

barplot(loi$c550)

barplot(loi$c950)

# Stacked barplot (x = reorder(sample, LOI))
loi_long = gather(loi, key = var, value = value, c550, c950)

ggplot(loi_long, aes(x = reorder(sample, value), y = value, fill = var)) +
  geom_bar(stat = 'identity', position = 'stack') + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

TGA LOI results

# Total distribution of LOI in 550 C (1) and 950 C (2)
boxplot(tga$c550, tga$c950)

# boxplot.stats gives us the lower whisker or minimum, the interquartile range of the middle (the second and fourth values), the median (third value) and the upper whisker or maximum. 
boxplot.stats(tga$c950)
## $stats
## [1] 5.611934 6.118481 6.532557 7.126867 8.599873
## 
## $n
## [1] 11
## 
## $conf
## [1] 6.052174 7.012940
## 
## $out
## [1] 4.097341
# Removing duplicates from the next graph
#rownames(tga)
#loi <- subset(loi[1:47, ])

ggplot(tga, 
      aes(c550, c950, label = rownames(tga))) +
      geom_point(size=2) +
      geom_text_repel(size=3) + labs(title = "Organic vs. carbonate content") +
      xlab('Organic LOI (%)') +
      ylab('Carbonate LOI (%)') +
      theme(axis.title = element_text())

barplot(tga$c550)

barplot(tga$c950)

# Stacked barplot (x = reorder(sample, LOI))
tga_long = gather(tga, key = var, value = value, c550, c950)

ggplot(tga_long, aes(x = reorder(Name, value), y = value, fill = var)) +
  geom_bar(stat = 'identity', position = 'stack') + 
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Colorimetry

# Libraries
library(dplyr); 
library(ggplot2); 
library(ggrepel);
library(rgl);
library(plot3D);
library(colordistance)

color1 <- read.table("data/color_AA.txt", header = TRUE, sep = "\t", dec = ",")
rownames(color1) <- color1$Data.Name
color <- subset(color1[, 3:5])
summary(color)
##        L               a               b        
##  Min.   :51.37   Min.   :3.880   Min.   :15.92  
##  1st Qu.:57.73   1st Qu.:4.402   1st Qu.:16.49  
##  Median :58.79   Median :4.555   Median :16.93  
##  Mean   :58.18   Mean   :5.067   Mean   :17.32  
##  3rd Qu.:59.58   3rd Qu.:4.695   3rd Qu.:17.32  
##  Max.   :61.57   Max.   :8.670   Max.   :21.47
ggplot(color, 
      aes(a, b, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) + 
      labs(title = "Color (a*b)") +
      xlab("a D65") +
      ylab("b D65") 

ggplot(color1,
      aes(X, L, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) +     
      labs(title = "Luminance") +
      ylab("L* (D65)") +
      xlab("Sample number") 

ggplot(color,
      aes(a, L, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) +     
      labs(title = "A versus luminance") +
      ylab("L* (D65)") +
      xlab("a") 

ggplot(color,
      aes(b, L, label = rownames(color))) +
      geom_point(size=2) +
      geom_text_repel(size=3) +     
      labs(title = "B versus luminance") +
      ylab("L* (D65)") +
      xlab("b") 

# Colorful 3D plot with drop lines
scatter3D(color$a, color$b, color$L, 
          xlab = "a* (D65)", ylab = "b* (D65)", zlab="L* (D65)",
          phi = 0, bty = "g",  type = "h", 
          ticktype = "detailed", pch = 19, cex = 1.2)
          text3D(color$a, color$b, color$L,  labels = rownames(color),
          add = TRUE, colkey = FALSE, cex = 0.7)

scatter3D(color$L, color$a, color$b, 
          xlab = "L* (D65)", ylab = "a* (D65)", zlab="b* (D65)",
          phi = 0, bty = "g",  type = "h", 
          ticktype = "detailed", pch = 19, cex = 1.2)
          text3D(color$L, color$a, color$b,  labels = rownames(color),
          add = TRUE, colkey = FALSE, cex = 0.7)